import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Do not cap the number of columns pandas shows when displaying a DataFrame.
pd.set_option('display.max_columns', None)
# Load the dataset; the path is relative to the current working directory.
df = pd.read_csv('mobile data set - mobile data set.csv')
# Preview the first 10 rows.
df.head(10)
| battery_power | blue | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | pc | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | price_range | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842 | 0 | 2.2 | 0 | 1 | 0 | 7 | 0.6 | 188 | 2 | 2 | 20 | 756 | 2549 | 9 | 7 | 19 | 0 | 0 | 1 | 1 |
| 1 | 1021 | 1 | 0.5 | 1 | 0 | 1 | 53 | 0.7 | 136 | 3 | 6 | 905 | 1988 | 2631 | 17 | 3 | 7 | 1 | 1 | 0 | 2 |
| 2 | 563 | 1 | 0.5 | 1 | 2 | 1 | 41 | 0.9 | 145 | 5 | 6 | 1263 | 1716 | 2603 | 11 | 2 | 9 | 1 | 1 | 0 | 2 |
| 3 | 615 | 1 | 2.5 | 0 | 0 | 0 | 10 | 0.8 | 131 | 6 | 9 | 1216 | 1786 | 2769 | 16 | 8 | 11 | 1 | 0 | 0 | 2 |
| 4 | 1821 | 1 | 1.2 | 0 | 13 | 1 | 44 | 0.6 | 141 | 2 | 14 | 1208 | 1212 | 1411 | 8 | 2 | 15 | 1 | 1 | 0 | 1 |
| 5 | 1859 | 0 | 0.5 | 1 | 3 | 0 | 22 | 0.7 | 164 | 1 | 7 | 1004 | 1654 | 1067 | 17 | 1 | 10 | 1 | 0 | 0 | 1 |
| 6 | 1821 | 0 | 1.7 | 0 | 4 | 1 | 10 | 0.8 | 139 | 8 | 10 | 381 | 1018 | 3220 | 13 | 8 | 18 | 1 | 0 | 1 | 3 |
| 7 | 1954 | 0 | 0.5 | 1 | 0 | 0 | 24 | 0.8 | 187 | 4 | 0 | 512 | 1149 | 700 | 16 | 3 | 5 | 1 | 1 | 1 | 0 |
| 8 | 1445 | 1 | 0.5 | 0 | 0 | 0 | 53 | 0.7 | 174 | 7 | 14 | 386 | 836 | 1099 | 17 | 1 | 20 | 1 | 0 | 0 | 0 |
| 9 | 509 | 1 | 0.6 | 1 | 2 | 1 | 9 | 0.1 | 93 | 5 | 15 | 1137 | 1224 | 513 | 19 | 10 | 12 | 1 | 0 | 0 | 0 |
# Size of the dataset as a (rows, columns) tuple.
df.shape
(2000, 21)
# Column labels of the dataset.
df.columns
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
'touch_screen', 'wifi', 'price_range'],
dtype='object')
'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range'
# Per-column dtype and non-null count, plus overall memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2000 entries, 0 to 1999 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 battery_power 2000 non-null int64 1 blue 2000 non-null int64 2 clock_speed 2000 non-null float64 3 dual_sim 2000 non-null int64 4 fc 2000 non-null int64 5 four_g 2000 non-null int64 6 int_memory 2000 non-null int64 7 m_dep 2000 non-null float64 8 mobile_wt 2000 non-null int64 9 n_cores 2000 non-null int64 10 pc 2000 non-null int64 11 px_height 2000 non-null int64 12 px_width 2000 non-null int64 13 ram 2000 non-null int64 14 sc_h 2000 non-null int64 15 sc_w 2000 non-null int64 16 talk_time 2000 non-null int64 17 three_g 2000 non-null int64 18 touch_screen 2000 non-null int64 19 wifi 2000 non-null int64 20 price_range 2000 non-null int64 dtypes: float64(2), int64(19) memory usage: 328.2 KB
pc - Primary Camera mega pixels
fc - Front Camera mega pixels
sc_h - Screen Height of mobile in cm
sc_w - Screen Width of mobile in cm
m_dep - Mobile Depth in cm
px_width - Pixel Resolution Width
px_height - Pixel Resolution Height
ram - Random Access Memory in Mega Bytes
int_memory - Internal Memory in Giga Bytes
four_g - Has 4G or not
three_g - Has 3G or not
dual_sim - Has dual sim support or not
battery_power - Total energy a battery can store in one time measured in mAh
touch_screen - Has touch screen or not
clock_speed - speed at which microprocessor executes instructions
n_cores - Number of cores of processor
wifi - Has wifi or not
blue - Has bluetooth or not
mobile_wt - Weight of mobile phone
talk_time - longest time that a single battery charge will last when you are talking on the phone
price_range - This is the target variable with value of 0(low cost), 1(medium cost), 2(high cost) and 3(very high cost).
# Number of missing values in each column.
df.isnull().sum()
battery_power 0 blue 0 clock_speed 0 dual_sim 0 fc 0 four_g 0 int_memory 0 m_dep 0 mobile_wt 0 n_cores 0 pc 0 px_height 0 px_width 0 ram 0 sc_h 0 sc_w 0 talk_time 0 three_g 0 touch_screen 0 wifi 0 price_range 0 dtype: int64
# Number of rows that repeat an earlier row exactly.
df.duplicated().sum()
0
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()
| battery_power | blue | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | ... | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | price_range | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2000.000000 | 2000.0000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | ... | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 |
| mean | 1238.518500 | 0.4950 | 1.522250 | 0.509500 | 4.309500 | 0.521500 | 32.046500 | 0.501750 | 140.249000 | 4.520500 | ... | 645.108000 | 1251.515500 | 2124.213000 | 12.306500 | 5.767000 | 11.011000 | 0.761500 | 0.503000 | 0.507000 | 1.500000 |
| std | 439.418206 | 0.5001 | 0.816004 | 0.500035 | 4.341444 | 0.499662 | 18.145715 | 0.288416 | 35.399655 | 2.287837 | ... | 443.780811 | 432.199447 | 1084.732044 | 4.213245 | 4.356398 | 5.463955 | 0.426273 | 0.500116 | 0.500076 | 1.118314 |
| min | 501.000000 | 0.0000 | 0.500000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.100000 | 80.000000 | 1.000000 | ... | 0.000000 | 500.000000 | 256.000000 | 5.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 851.750000 | 0.0000 | 0.700000 | 0.000000 | 1.000000 | 0.000000 | 16.000000 | 0.200000 | 109.000000 | 3.000000 | ... | 282.750000 | 874.750000 | 1207.500000 | 9.000000 | 2.000000 | 6.000000 | 1.000000 | 0.000000 | 0.000000 | 0.750000 |
| 50% | 1226.000000 | 0.0000 | 1.500000 | 1.000000 | 3.000000 | 1.000000 | 32.000000 | 0.500000 | 141.000000 | 4.000000 | ... | 564.000000 | 1247.000000 | 2146.500000 | 12.000000 | 5.000000 | 11.000000 | 1.000000 | 1.000000 | 1.000000 | 1.500000 |
| 75% | 1615.250000 | 1.0000 | 2.200000 | 1.000000 | 7.000000 | 1.000000 | 48.000000 | 0.800000 | 170.000000 | 7.000000 | ... | 947.250000 | 1633.000000 | 3064.500000 | 16.000000 | 9.000000 | 16.000000 | 1.000000 | 1.000000 | 1.000000 | 2.250000 |
| max | 1998.000000 | 1.0000 | 3.000000 | 1.000000 | 19.000000 | 1.000000 | 64.000000 | 1.000000 | 200.000000 | 8.000000 | ... | 1960.000000 | 1998.000000 | 3998.000000 | 19.000000 | 18.000000 | 20.000000 | 1.000000 | 1.000000 | 1.000000 | 3.000000 |
8 rows × 21 columns
# Number of rows in each class of the target variable (price_range);
# equal counts mean the classes are balanced.
print(df['price_range'].value_counts())
1 500 2 500 3 500 0 500 Name: price_range, dtype: int64
# Pairplot to visualize relationships between numerical features, coloured by
# price_range, with KDE curves on the diagonal.
# NOTE(review): this draws one subplot per pair of numeric columns, so on a
# frame this wide it is slow to render — consider passing `vars=` with a subset.
sns.pairplot(df, hue='price_range', diag_kind='kde')
plt.show()
# For each camera column, draw a scatter plot against price_range followed by
# a bar plot of the per-class average.
for camera_col, camera_name in (('pc', 'Primary Camera'), ('fc', 'Front Camera')):
    # Raw observations: camera megapixels vs price class
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df, x=camera_col, y='price_range')
    plt.xlabel(f'{camera_name} (Mega Pixels)')
    plt.ylabel('Price Range')
    plt.title(f'Price Range vs {camera_name}')
    plt.show()

    # Average camera megapixels within each price class
    plt.figure(figsize=(10, 6))
    sns.barplot(data=df, x='price_range', y=camera_col)
    plt.xlabel('Price Range')
    plt.ylabel(f'Average {camera_name} (Mega Pixels)')
    plt.title(f'Average {camera_name} for Each Price Range')
    plt.show()
# Mean battery_power per price_range class, one row per class
average_battery_power = df.groupby('price_range', as_index=False)['battery_power'].mean()
# Horizontal bars: one bar per price class, length = mean battery power
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=average_battery_power, x='battery_power', y='price_range', orient='h', ax=ax)
ax.set_xlabel('Average Battery Power (mAh)')
ax.set_ylabel('Price Range')
ax.set_title('Average Battery Power for Each Price Range')
plt.show()

# Mean n_cores per price_range class, one row per class
average_n_cores = df.groupby('price_range', as_index=False)['n_cores'].mean()
# Horizontal bars: one bar per price class, length = mean number of cores
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=average_n_cores, x='n_cores', y='price_range', orient='h', ax=ax)
ax.set_xlabel('Average Number of Cores')
ax.set_ylabel('Price Range')
ax.set_title('Average Number of Cores for Each Price Range')
plt.show()
# Columns used by the pixel-resolution scatter plot
px_width, px_height, price_range = df['px_width'], df['px_height'], df['price_range']

# One point per phone: position is the resolution, colour is the price class
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(px_width, px_height, c=price_range, cmap='viridis', marker='o', alpha=0.7)
ax.set_xlabel('Pixel Resolution Width')
ax.set_ylabel('Pixel Resolution Height')
ax.set_title('Relationship between Pixel Resolution and Price Range')

# The colour bar serves as the legend for price_range
cbar = fig.colorbar(points, ax=ax)
cbar.set_label('Price Range', rotation=270, labelpad=15)
plt.show()
# Mean px_width and px_height within each price_range class
grouped_df = df.groupby('price_range', as_index=False)[['px_width', 'px_height']].mean()

bar_width = 0.35
half_width = bar_width / 2
class_positions = grouped_df['price_range']

# Paired bars per class: width sits left of the class tick, height sits right
plt.figure(figsize=(10, 6))
plt.bar(class_positions - half_width, grouped_df['px_width'], width=bar_width, label='Pixel Width')
plt.bar(class_positions + half_width, grouped_df['px_height'], width=bar_width, label='Pixel Height')

# Axis labels, title, one tick per price class, and the series legend
plt.xlabel('Price Range')
plt.ylabel('Average Pixel Resolution')
plt.title('Average Pixel Resolution for Each Price Range Category')
plt.xticks(class_positions)
plt.legend()
plt.show()
# Mean RAM per price_range class, one row per class
average_ram_by_price_range = df.groupby('price_range', as_index=False)['ram'].mean()

# Vertical bars: one bar per price class, height = mean RAM
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=average_ram_by_price_range, x='price_range', y='ram', ax=ax)
ax.set_xlabel('Price Range')
ax.set_ylabel('Average RAM (Random Access Memory in MB)')
ax.set_title('Relationship between RAM and Price Range')
plt.show()
# Human-readable names for the price_range codes, listed in ascending code order.
price_range_labels = {0: 'Low Cost', 1: 'Medium Cost', 2: 'High Cost', 3: 'Very High Cost'}
price_label_order = list(price_range_labels.values())

# Plot from a copy that carries the class name as its own column, so seaborn
# builds the legend itself and each name is tied to the colour of its bars.
# Relabelling afterwards with plt.legend(labels=[...]) matches names to plot
# artists purely by position, which can attach a name to the wrong colour.
labelled_df = df.assign(price_label=df['price_range'].map(price_range_labels))

# One count plot per connectivity flag: bars grouped by flag value (0/1),
# split by price class.
for flag_col, flag_name in (('three_g', '3G'), ('four_g', '4G')):
    plt.figure(figsize=(8, 6))
    sns.countplot(
        x=flag_col,
        hue='price_label',
        hue_order=price_label_order,  # keep classes in low -> very high order
        data=labelled_df,
        palette='viridis',
    )
    plt.xlabel(f'{flag_name} Connectivity')
    plt.ylabel('Count')
    plt.title(f'Relationship between {flag_name} Connectivity and Price Range')
    # Only override the legend title; entries and colours come from seaborn.
    plt.legend(title='Price Range')
    plt.show()
# Bar plot of each screen dimension (sc_h, sc_w) per price_range class
for screen_col, screen_dim in (('sc_h', 'Height'), ('sc_w', 'Width')):
    plt.figure(figsize=(10, 6))
    sns.barplot(data=df, x='price_range', y=screen_col)
    plt.xlabel('Price Range')
    plt.ylabel(f'Screen {screen_dim} (cm)')
    plt.title(f'Relationship between Screen {screen_dim} and Price Range')
    plt.show()

# Bar plot of price_range for phones without (0) and with (1) a touch screen
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='touch_screen', y='price_range', ax=ax)
ax.set_xlabel('Touch Screen')
ax.set_ylabel('Price Range')
ax.set_title('Relationship between Touch Screen and Price Range')
plt.show()
# Pairwise correlation between all columns of the dataset
correlation_matrix = df.corr()

# Annotated heatmap of the correlation matrix (two decimals per cell)
fig, ax = plt.subplots(figsize=(18, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Feature columns (everything except the target) and the target itself
feature_columns = [
    'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
    'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
    'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
    'touch_screen', 'wifi',
]
X_1 = df[feature_columns]
y_1 = df['price_range']

# Hold out 20% of the rows; the fixed seed makes the split reproducible
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(
    X_1, y_1, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training split
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_1_train, y_1_train)

# Rank features from most to least important
feature_importances = random_forest_model.feature_importances_
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_features = X_1.columns[sorted_indices]
sorted_importances = feature_importances[sorted_indices]

# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=sorted_importances, y=sorted_features, ax=ax)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance in Predicting Price Range')
plt.show()
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Target is price_range; every other column is used as a feature
X = df.drop(columns='price_range')
y = df['price_range']

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Linear-kernel SVM trained on the raw (unscaled) features
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train, y_train)
SVC(kernel='linear', random_state=42)
# Predicted price class for every held-out row
y_pred = svm_model.predict(X_test)

# Compare the predictions with the true labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
classification_report_str = classification_report(y_true=y_test, y_pred=y_pred)

print(X_test)
battery_power blue clock_speed dual_sim fc four_g int_memory \
1860 1646 0 2.5 0 3 1 25
353 1182 0 0.5 0 7 1 8
1333 1972 0 2.9 0 9 0 14
905 989 1 2.0 0 4 0 17
1289 615 1 0.5 1 7 0 58
... ... ... ... ... .. ... ...
965 1379 0 0.5 1 1 0 19
1284 991 0 2.0 0 2 1 12
1739 1044 0 1.8 0 4 1 12
261 728 0 2.7 1 0 0 25
535 1185 0 1.9 0 0 0 31
m_dep mobile_wt n_cores pc px_height px_width ram sc_h sc_w \
1860 0.6 200 2 5 211 1608 686 8 6
353 0.5 138 8 16 275 986 2563 19 17
1333 0.4 196 7 18 293 952 1316 8 1
905 0.2 166 3 19 256 1394 3892 18 7
1289 0.5 130 5 8 1021 1958 1906 14 5
... ... ... ... .. ... ... ... ... ...
965 0.3 134 8 17 387 671 3912 11 2
1284 0.3 158 5 6 1209 1678 2014 11 9
1739 0.7 104 6 5 1230 1263 1794 18 7
261 0.2 88 4 1 526 1529 2039 5 1
535 0.4 152 8 7 837 1642 2447 16 2
talk_time three_g touch_screen wifi
1860 11 1 1 0
353 19 1 0 0
1333 8 1 1 0
905 19 1 1 0
1289 5 1 0 0
... ... ... ... ...
965 19 0 1 1
1284 10 1 0 0
1739 19 1 1 1
261 12 1 1 1
535 3 1 1 1
[400 rows x 20 columns]
# Predicted price class for each row of X_test, in the same row order.
print(y_pred)
[0 2 1 3 1 1 2 0 3 1 0 1 2 3 3 2 3 3 1 0 0 2 1 2 0 1 3 2 2 0 0 0 3 0 1 1 2 0 3 0 2 3 2 0 2 3 2 1 3 1 3 1 0 0 1 1 1 3 0 0 1 3 3 1 0 0 3 3 1 2 2 2 0 1 2 0 1 3 2 2 3 2 1 0 1 3 1 3 3 0 3 3 2 1 3 2 2 3 1 1 0 0 1 0 1 3 2 0 1 1 0 0 3 1 3 2 3 2 0 2 1 3 2 1 3 3 0 2 0 2 3 0 2 2 0 3 1 0 0 2 2 1 2 2 0 0 0 1 1 2 3 1 1 0 2 2 0 1 0 2 2 3 3 3 1 0 1 2 2 3 3 0 1 0 3 1 1 2 1 0 0 0 0 0 3 2 0 3 0 0 0 0 1 3 3 1 0 1 1 1 1 2 2 3 3 3 1 2 0 0 0 2 1 1 3 1 1 2 1 1 3 2 3 0 0 2 1 3 0 1 2 0 2 3 2 0 1 3 3 0 1 3 3 3 0 3 1 2 3 3 2 1 1 3 3 1 3 3 3 3 3 0 1 2 2 1 3 0 2 3 2 2 2 1 0 1 0 3 3 1 3 1 1 3 1 2 0 0 3 0 1 2 3 3 3 1 1 0 1 3 3 0 1 2 2 0 3 3 2 3 2 3 2 0 2 1 1 1 0 0 0 3 3 3 1 0 1 0 1 2 3 0 3 3 2 1 3 0 0 2 1 3 2 0 1 1 1 1 1 3 2 0 0 3 3 0 3 0 0 2 0 1 2 2 2 3 0 3 2 3 3 3 3 2 1 1 0 3 1 3 3 0 2 3 2 3 3 3 0 0 2 3 0 0 2 3 2 1 1 2]
# Report accuracy followed by the per-class precision/recall/F1 table
print(
    f"Accuracy: {accuracy:.2f}",
    "Classification Report:",
    classification_report_str,
    sep="\n",
)
Accuracy: 0.97
Classification Report:
precision recall f1-score support
0 1.00 0.94 0.97 105
1 0.91 1.00 0.95 91
2 0.99 0.95 0.97 92
3 0.98 0.99 0.99 112
accuracy 0.97 400
macro avg 0.97 0.97 0.97 400
weighted avg 0.97 0.97 0.97 400
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear-kernel SVM trained on the standardized features
svm_model_scaled = SVC(kernel='linear', C=1.0, random_state=42)
svm_model_scaled.fit(X_train_scaled, y_train)
SVC(kernel='linear', random_state=42)
# Predict on the standardized test data with the model that was trained on
# standardized features. `svm_model` was fitted on the raw features, so
# giving it standardized inputs would evaluate it on a scale it never saw.
y_pred_scaled = svm_model_scaled.predict(X_test_scaled)
# Evaluate the model
accuracy_scaled = accuracy_score(y_test, y_pred_scaled)
classification_report_str_scaled = classification_report(y_test, y_pred_scaled)
# Report accuracy followed by the per-class precision/recall/F1 table
print("Using StandardScaler:")
print(f"Accuracy: {accuracy_scaled:.2f}")
print("Classification Report:")
print(classification_report_str_scaled)
Using StandardScaler:
Accuracy: 0.26
Classification Report:
precision recall f1-score support
0 0.26 1.00 0.42 105
1 0.00 0.00 0.00 91
2 0.00 0.00 0.00 92
3 0.00 0.00 0.00 112
accuracy 0.26 400
macro avg 0.07 0.25 0.10 400
weighted avg 0.07 0.26 0.11 400
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training split only, then rescale
# both splits with those values
mms = MinMaxScaler().fit(X_train)
X_train_mms_scaled = mms.transform(X_train)
X_test_mms_scaled = mms.transform(X_test)

# Linear-kernel SVM trained and evaluated on the min-max scaled features
svm_model_mms = SVC(kernel='linear', C=1.0, random_state=42)
svm_model_mms.fit(X_train_mms_scaled, y_train)
y_pred_mms = svm_model_mms.predict(X_test_mms_scaled)

accuracy_mms = accuracy_score(y_true=y_test, y_pred=y_pred_mms)
classification_report_str_mms = classification_report(y_true=y_test, y_pred=y_pred_mms)

# Report accuracy followed by the per-class precision/recall/F1 table
print(
    "Using MinMaxScaler:",
    f"Accuracy: {accuracy_mms:.2f}",
    "Classification Report:",
    classification_report_str_mms,
    sep="\n",
)
Using MinMaxScaler:
Accuracy: 0.96
Classification Report:
precision recall f1-score support
0 1.00 0.97 0.99 105
1 0.92 1.00 0.96 91
2 0.95 0.90 0.93 92
3 0.96 0.96 0.96 112
accuracy 0.96 400
macro avg 0.96 0.96 0.96 400
weighted avg 0.96 0.96 0.96 400
Conclusion
The high accuracy of 97% in the first case indicates that the model is able to predict the mobile phone price range with high accuracy when features are used without any scaling.
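The low accuracy of 26% in the second case does not measure the effect of StandardScaler. The predictions for that case were made with the model trained on unscaled features (`svm_model`) applied to standardized test data, which is why every sample was assigned to class 0 (recall of 1.00 for class 0 and 0.00 for the others). The StandardScaler experiment has to be re-evaluated with the model trained on standardized features (`svm_model_scaled`) before any conclusion about its effect can be drawn; what the 26% does show is that a model must be evaluated on data preprocessed exactly as its training data was.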
The accuracy of 96% in the third case suggests that using MinMaxScaler helps in improving the model's performance compared to the second case but is slightly lower than the first case. MinMaxScaler scales the features to a specific range, which seems to help the model make better predictions.
Overall, scaling the features using appropriate scalers can have a significant impact on the model's performance, and it is essential to choose the right scaling method depending on the data and the machine learning algorithm being used.